Data preparation (continued)

In the examples below, we’ll be working with a random sample of 10,000 public Facebook posts by Members of the U.S. Congress. The overall question that we will be trying to answer is: what type of posts gets more likes?

library(DBI)
db <- dbConnect(RSQLite::SQLite(), "~/data/facebook-db.sqlite")

df <- dbGetQuery(db, 
                 "SELECT posts.screen_name, date, posts.type AS post_type, 
                      message, likes_count, comments_count, shares_count,
                      love_count, haha_count, wow_count, angry_count,
                      sad_count, gender, congress.type, party
                 FROM posts JOIN congress
                 ON congress.screen_name = posts.screen_name
                 ORDER BY RANDOM()
                 LIMIT 10000")
## Warning in rsqlite_fetch(res@ptr, n = n): Column `screen_name`: mixed type,
## first seen values of type string, coercing other values of type real
# also available as:
df <- read.csv("fb-congress-data.csv", stringsAsFactors=FALSE)

And now we load the ggplot2 package:

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

Customizing axes

# changing axis titles
p <- ggplot(df, aes(x=likes_count))
p + geom_histogram() + scale_x_log10("Number of likes") +
    scale_y_continuous("Post count")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

# changing axis limits (continuous variables)
p + geom_histogram() + scale_y_continuous("Post count") +
  scale_x_log10("Number of likes", limits=c(1, 100000)) ## in log scale
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 27 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# changing axis breaks and labels
p <- ggplot(df, aes(x=likes_count))
p + geom_histogram() + scale_x_log10(breaks=c(1, 10, 100, 1000))
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

p + geom_histogram() + scale_x_log10(breaks=c(1, 10, 100, 1000, 10000),
    labels=c(1, 10, 100, "1K", "10K"))
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

# changing axis breaks and labels (categorical variables)
p <- ggplot(df, aes(x=party))
p + geom_bar() + scale_x_discrete(labels=c("D", "I", "R"))

# changing axis breaks and labels (date variables)
counts <- dbGetQuery(db, 
                     "SELECT date, COUNT(1) as post_count
                     FROM posts 
                     GROUP BY date
                     ORDER BY date")

p <- ggplot(counts, aes(x=as.Date(date), y=post_count))
p + geom_line() ## line: posts per day

p + geom_line() + scale_x_date(date_breaks="2 months")

library(scales) # loading additional library scales
p + geom_line() + scale_x_date(date_breaks="2 months", 
    labels = date_format("%Y-%m"))

Customizing scales (properties of geoms) and legends

Note that all the modifications below can be linked to a factor variable, as in previous script, and then it becomes an aesthetics; or it can go out as a property of the geom and then applied to all of them.

p <- ggplot(df, aes(x=likes_count))
p + scale_x_log10() + geom_histogram(color="red") # border of geoms
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

p + scale_x_log10() + geom_histogram(fill="red") # area of geoms
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

p <- ggplot(df, aes(x=likes_count, y=comments_count))
p + scale_x_log10() + scale_y_log10() + geom_point(color="red")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

p + scale_x_log10() + scale_y_log10() + geom_point(shape=15)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

p + scale_x_log10() + scale_y_log10() + geom_point(size=1)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

# for a complete list for each geom, type ?geom_point, ?geom_line, ...

Scales of aesthetics can also be modified manually very easily

p <- ggplot(df, aes(x=likes_count, y=comments_count))
p + geom_point(aes(color=post_type)) + scale_x_log10() + scale_y_log10() +
    scale_color_manual("Post type",
        limits = c("link", "status", "video", "photo"),
        values=c("blue", "grey", "red", "yellow"))
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 66 rows containing missing values (geom_point).

p + geom_point(aes(shape=post_type)) + scale_x_log10() + scale_y_log10() +
    scale_shape_manual("Post type",
        limits = c("link", "status", "video", "photo"),
        values=c(1, 3, 4, 5)) ## anything from 1 to 25
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 66 rows containing missing values (geom_point).

Finally, we can also modify the labels in the legend:

p + geom_point(aes(shape=post_type)) + scale_x_log10() + scale_y_log10() +
    scale_shape_manual("Post type",
        limits = c("link", "status", "video", "photo"),
        labels = c("Link", "Status", "Video", "Photo"),
        values=c(1, 3, 4, 5)) ## anything from 1 to 25
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 66 rows containing missing values (geom_point).

Customizing facets

Multiple plots (1 factor variable):

p + geom_point() + scale_x_log10() + scale_y_log10() +
    facet_wrap(~post_type)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

p + geom_point() + scale_x_log10() + scale_y_log10() +
    facet_wrap(~post_type, nrow=4)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

p + geom_point() + scale_x_log10() + scale_y_log10() +
    facet_wrap(~post_type, ncol=4)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

Multiple plots (2 factor variables)

p + geom_point() + scale_x_log10() + scale_y_log10() +
    facet_grid(gender~post_type)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

Changing themes

p + geom_point() + scale_x_log10() + scale_y_log10() + theme_bw()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

p + geom_point() + scale_x_log10() + scale_y_log10() + theme_grey()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

p + geom_point() + scale_x_log10() + scale_y_log10() + theme_minimal()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

p + geom_point() + scale_x_log10() + scale_y_log10() + theme_classic()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

All theme options can be edited manually using ‘theme’, e.g.

p + geom_point() + scale_x_log10() + scale_y_log10() +
    theme( axis.ticks = element_blank() ) # removing axis ticks
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

See ?theme for all possible options. We will see more examples later.

Saving plots

Different ways of doing this, easiest is with ‘ggsave’

pq <- p + geom_point() + scale_x_log10() + scale_y_log10() +
    facet_grid(party~gender)

ggsave(pq, file="grid_plot.pdf", height=6, width=6)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

Multiple types of plots on a grid

library(gridExtra) ## adds more functions to grid
# posts by type of post
p1 <- ggplot(df, aes(x=post_type)) + geom_bar() + theme_minimal() +
    scale_y_continuous("Number of posts") + 
    scale_x_discrete("Type of post") +
    theme (axis.ticks.x = element_blank())

p2 <- ggplot(df, aes(x=gender)) + geom_bar() + theme_minimal() +
     scale_x_discrete("Gender of legislator", 
        labels=c("Female", "Male")) +   
    theme (axis.title.y = element_blank(), axis.text.y = element_blank(),
        axis.ticks = element_blank())

# average number of likes by type of post
p3 <- ggplot(df, aes(x=post_type, y=likes_count)) + 
    stat_summary(fun.y="mean", geom="point", size=5, shape=15) +
    theme_minimal() + scale_y_continuous("Average likes count") +
    scale_x_discrete("Type of post") +
    theme (axis.ticks.x = element_blank()) 

# average number of likes by gender
p4 <- ggplot(df, aes(x=gender, y=likes_count)) + 
    stat_summary(fun.y="mean", geom="point", size=5, shape=15) +
    theme_minimal() + scale_x_discrete("Gender of legislator", 
        labels=c("Female", "Male")) +   
    theme (axis.title.y = element_blank(), axis.text.y = element_blank(),
        axis.ticks = element_blank())

pdf("multiple_plots_grid.pdf", height=6, width=6)
grid.arrange(arrangeGrob(p1, p2, p3, p4, ncol=2, heights=c(0.6, 0.4)))
dev.off()
## quartz_off_screen 
##                 2